In [50]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
import requests
import time
import json
from requests_futures.sessions import FuturesSession
from collections import namedtuple

Bay Area Bike Share System

Preliminary Analysis

Data obtained from: http://www.bayareabikeshare.com/open-data


In [35]:
# read bike trip, weather, and station data

trp = pd.read_csv('data/trip.csv')
weather = pd.read_csv('data/weather.csv')
stations = pd.read_csv('data/station.csv')

In [ ]:
# calculate total trip counts for each station

# count trips starting and ending at each terminal
start_counts = pd.DataFrame(trp['Start Terminal'].value_counts())
end_counts = pd.DataFrame(trp['End Terminal'].value_counts())

start_counts['station_id'] = start_counts.index
start_counts.columns = ['start counts', 'station_id']
end_counts['station_id'] = end_counts.index
end_counts.columns = ['end counts', 'station_id']

stations = stations.merge(start_counts)
stations = stations.merge(end_counts)
stations['total counts'] = stations['start counts'] + stations['end counts']
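
As a quick check on the merged counts, the busiest stations can be listed (a minimal sketch; it uses only the columns created above):

In [ ]:
# list the ten busiest stations by combined start and end trips
stations.sort_values('total counts', ascending=False).head(10)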

In [40]:
print 'total number of docks:', stations.dockcount.sum()
print 'total number of bikes:', trp['Bike #'].nunique()
# print trp['Subscription Type'].value_counts()


total number of docks: 1236
total number of bikes: 700

Explore the daily trips


In [5]:
# plot bike trips for each day

trp['Start Date'] = pd.to_datetime(trp['Start Date'])
trp['End Date'] = pd.to_datetime(trp['End Date'])

nday = trp['Start Date'].dt.date.nunique()
stat_days = trp['Start Date'].dt.date.value_counts()

fig, ax = plt.subplots(1, 1, figsize=(16, 8))
stat_days.sort_index().plot(ax=ax)  # sort by date so the line plots chronologically


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f715c6142d0>
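
The daily series is noisy; a 7-day rolling mean makes the weekly cycle and the seasonal trend easier to read (a sketch, assuming pandas >= 0.18 for the .rolling API; older versions would use pd.rolling_mean instead):

In [ ]:
# smooth daily trip counts with a 7-day rolling mean
daily = stat_days.sort_index()
daily.rolling(window=7).mean().plot(figsize=(16, 8))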

In [6]:
# remove spurious trips that are too long (over 10,000 seconds);
# .copy() avoids the SettingWithCopyWarning when adding a column below
normal_trip = trp[trp.Duration < 10000].copy()
normal_trip['minutes'] = normal_trip['Duration'] / 60.0  # duration is in seconds



Explore the average trip time for annual subscribers versus short-term customers


In [8]:
# plot the distribution of trip time for annual subscribers and short-term customers,
# overlaid on shared axes so the two groups are directly comparable
ax = normal_trip.loc[normal_trip['Subscription Type'] == 'Subscriber', 'minutes'].hist(bins=60, range=[0, 60], alpha=0.5)
normal_trip.loc[normal_trip['Subscription Type'] == 'Customer', 'minutes'].hist(bins=60, range=[0, 60], alpha=0.5, ax=ax)
plt.legend(['subscriber', 'customer'])


Out[8]:
<matplotlib.legend.Legend at 0x7f7144fb1c50>
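
The distributions above can also be summarized numerically; the median is included because trip durations have a long right tail (a minimal sketch using the normal_trip frame built above):

In [ ]:
# mean and median trip time in minutes, per subscription type
normal_trip.groupby('Subscription Type')['minutes'].agg(['mean', 'median', 'count'])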

Trips on weekdays and weekends


In [9]:
# analyze trips on weekdays versus weekends

trips_weekday = []
for i in range(7):
    day_trips = trp[trp['Start Date'].dt.weekday == i]
    n_subscriber = (day_trips['Subscription Type'] == 'Subscriber').sum()
    n_customer = (day_trips['Subscription Type'] == 'Customer').sum()
    trips_weekday.append([i, n_subscriber, n_customer])
trips_weekday = pd.DataFrame(trips_weekday)

ax = trips_weekday.plot(x=0, y=[1, 2])
ax.legend(['subscriber', 'customer'])
ax.set_xticks(range(7))  # pin the ticks so the day labels line up
ax.set_xticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
ax.set_title('trips by day of week')


Out[9]:
<matplotlib.text.Text at 0x7f71445e61d0>
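
Aggregating into weekday versus weekend totals makes the contrast explicit (a minimal sketch reusing the trips_weekday frame built above; days 0-4 are weekdays):

In [ ]:
# weekday (Mon-Fri) vs weekend (Sat-Sun) trip totals per rider type
weekday = trips_weekday[trips_weekday[0] < 5][[1, 2]].sum()
weekend = trips_weekday[trips_weekday[0] >= 5][[1, 2]].sum()
print 'weekday subscriber/customer:', weekday[1], weekday[2]
print 'weekend subscriber/customer:', weekend[1], weekend[2]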

Collecting external data

1. location data from the Google Maps API

2. public suggestions from http://suggest.bayareabikeshare.com/page/about


In [24]:
# get elevations from the Google Maps Elevation API

def get_elevation(location):
    """Query the elevation for a single (lat, long) tuple."""
    url = 'https://maps.googleapis.com/maps/api/elevation/json?locations='
    url = url + str(location[0]) + ',' + str(location[1])
    response = requests.get(url)
    elevation_dict = response.json()
    return elevation_dict['results'][0]['elevation']


def GetElevs(locs):
    """
    Query elevations for a list of (lat, long) tuples in one batched request.
    """
    url = 'https://maps.googleapis.com/maps/api/elevation/json'
    locs_str = '|'.join(','.join(map(str, loc)) for loc in locs)
    params = {'locations': locs_str}
    response = requests.get(url, params=params)
    elevation_dict = response.json()
    return elevation_dict

In [25]:
locs = list(zip(stations.lat, stations.long))
elevation_dict = GetElevs(locs)
elevations = [e['elevation'] for e in elevation_dict['results']]

stations['elevation'] = elevations

Explore whether there is a preference for downhill trips over uphill trips


In [27]:
# Distribution of downhill and uphill trips
# join the trip table, adding the start and end elevations, and elevation gain

stations.index = stations['station_id']

trp['start elevation'] = trp.join(stations,on='Start Terminal')['elevation']
trp['end elevation'] = trp.join(stations,on='End Terminal')['elevation']
trp['elevation gain'] = trp['end elevation'] - trp['start elevation']

trp['elevation gain'].hist(bins=20)


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7144e06810>
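
The histogram suggests an asymmetry; counting downhill versus uphill trips quantifies it (a minimal sketch on the elevation gain column just added; trips between equal-elevation stations count as flat):

In [ ]:
# quantify the downhill preference
downhill = (trp['elevation gain'] < 0).sum()
uphill = (trp['elevation gain'] > 0).sum()
flat = (trp['elevation gain'] == 0).sum()
print 'downhill: %d, uphill: %d, flat: %d' % (downhill, uphill, flat)
print 'downhill/uphill ratio: %.2f' % (float(downhill) / uphill)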

In [ ]:
# get location features from the Google Maps Places API
def get_place(loc, pagetoken=None):
    url = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json'
    loc_str = ','.join(map(str, loc))  # use the passed location rather than a hardcoded one
    params = {'pagetoken': pagetoken, 'location': loc_str, 'radius': 500, 'key': key}
    response = requests.get(url, params=params)
    place_dict = response.json()
    return place_dict

def radar(keyword, loc, radius, key):
    url = 'https://maps.googleapis.com/maps/api/place/radarsearch/json'
    loc = ','.join(map(str, loc))
    params = {'keyword': keyword, 'location': loc, 'radius': radius, 'key': key}
    response = session.get(url, params=params)  # relies on a module-level FuturesSession named `session`
    place_dict = response.json()
    return place_dict

def placedetail(placeid):
    url = 'https://maps.googleapis.com/maps/api/place/details/json'
    params = {'placeid': placeid, 'key': key}
    response = requests.get(url, params=params)
    place_dict = response.json()
    return place_dict


# firing many parallel requests makes it easy to get denied (rate-limited) by the API
def futureElevation(locs, key):
    url = 'https://maps.googleapis.com/maps/api/elevation/json?'
    session = FuturesSession(max_workers=20)
    params_list = []
    for loc in locs:
        loc_str = ','.join( map(str, loc) )
        params = {'locations': loc_str, 'key': key}
        params_list.append(params)
    futures = [session.get(url, params=params) for params in params_list]
    responses = [future.result() for future in futures]
    dicts = [response.json() for response in responses]
    return dicts
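
Since the fully parallel version above is easy to get denied, a gentler alternative is to batch locations into pipe-separated requests (as GetElevs does) but in chunks, throttled between batches. A sketch; the chunk size of 50 and the 1-second pause are assumptions, not documented API limits:

In [ ]:
def chunkedElevations(locs, key, chunk=50):
    """Query elevations in batches of `chunk` locations per request."""
    url = 'https://maps.googleapis.com/maps/api/elevation/json'
    elevations = []
    for start in range(0, len(locs), chunk):
        batch = locs[start:start + chunk]
        locs_str = '|'.join(','.join(map(str, loc)) for loc in batch)
        params = {'locations': locs_str, 'key': key}
        response = requests.get(url, params=params)
        elevations.extend(r['elevation'] for r in response.json()['results'])
        time.sleep(1)  # assumed pause; throttle between batches to avoid denial
    return elevations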

In [ ]:
# tests


# elevations = []
# for loc in zip(stations.lat, stations.long):
#     elev = get_elevation(loc)
#     elevations.append(elev)
#     time.sleep(10)  # throttle between requests
#     print elev


# pagetoken = 'CoQC8wAAAMu7YnfnXbQOD3fH5yiww__3T6rVGhkxcMf4Jrx8K8ADkr6BtG4oZmO5QGpCU35b3nNY4jDMAEYX1xKnr7RKjdVGu3MO011AaMdhTFhnUNVI5WBPmiRGafmrThpZGqQZWfPB8gNQLOhSJhm8kH1yZ9rwXaJ-bmyaUHge_9I-b6GrHq0pHX4yXjfkrPtEbm6p7rSpAkIFwv-mIGewBqANCIzvkY1kRw1mBWWDkhBHFY-hKYawt6mb07aTov9k4g36f6nhjr2KFLXZ7NPXQF8FHpo_qrN_D30nE5607F6kDmnLaei8_VmDF_g1ieQyIRXhnRS7C7GwG1im9IsqjrX8NOgSEB6B_2zqwFL2OdbwScSzqDgaFM24Zs-R3b-_9rb8rt7IfcrTe7X3'
# place = get_place([0,0],pagetoken)
# pd.io.json.json_normalize(place)
# print place.keys()
# place['results']

# def allplaces(loc, radius=50):
    
#     return places

# loc = [0,0]
# results = []
# pagetoken = None
# while pagetoken != 'done':
#     response =  get_place(loc, pagetoken)
# #     print len(response['results'])
#     results.extend( response['results'] )
#     if 'next_page_token' in response:
#         pagetoken = response['next_page_token']
#     else:
#         pagetoken = 'done'
#     print pagetoken

# len(results)
# loc = [0,0]              
# search = radar(loc)
# print len(search['results'])
# print search['results']
# placedetail('ChIJN-dFBoaAhYAR8cYslWhgUJk')['result']


# loc = tuple(df.iloc[1, 1:3])
# loc = (37.7817476,  -122.4071379)
# radius = 300
# keyword = 'museum'
# print loc
# search = radar(keyword, loc, radius, key)
# print len( search['results'] )


# session = FuturesSession(max_workers=50)
# url = 'https://maps.googleapis.com/maps/api/place/radarsearch/json'
# params = {'keyword': keyword, 'location': loc, 'radius': radius, 'key': key}
# response = session.get(url, params = params)
# r1 = response.result()
# r1.json()

In [ ]:
with open('data/suggestions.csv', 'r') as f:
    df = pd.read_csv(f)
    
with open('secret/key', 'r') as f:
    key = f.read().replace('\n', '')

Collect location features for stations from the Google Maps API


In [ ]:
# collect location features for stations

def futureRadar(df, keyword, radius, key):
    url = 'https://maps.googleapis.com/maps/api/place/radarsearch/json'
    session = FuturesSession(max_workers=40)
    params_list = []
    for i in df.itertuples(index=False):
        loc = ','.join(map(str, i[2:4]))  # tuple positions 2:4 hold lat and long
        params = {'keyword': keyword, 'location': loc, 'radius': radius, 'key': key}
        params_list.append(params)
    futures = [session.get(url, params=params) for params in params_list]
    responses = [future.result() for future in futures]
    dicts = [response.json() for response in responses]
    return dicts

keywords = ['train', 'bus', 'food', 'shopping', 'park', 'school']
radius = 300

session = FuturesSession(max_workers=50)
for keyword in keywords:
    dicts = futureRadar(stations, keyword, radius, key)
    stations[keyword] = [len(i['results']) for i in dicts]
    
# stations.to_csv('data/station_counts.csv')

Generate grid points for prediction


In [44]:
# generate a grid of points covering SF, used for prediction
def GenGrid(lowerleft, upperright, npt):
    """Return npt x npt (lat, long) tuples spanning the bounding box."""
    lat = np.linspace(lowerleft[0], upperright[0], npt)
    lng = np.linspace(lowerleft[1], upperright[1], npt)
    latv, lngv = np.meshgrid(lat, lng)
    locs = list(zip(latv.reshape(npt**2), lngv.reshape(npt**2)))
    return locs

lowerleft = (37.71626, -122.513951)
upperright = (37.811164, -122.385373)
locs = GenGrid(lowerleft,upperright,35)

# visualize the grid (lat on the x-axis, long on the y-axis)
x = [loc[0] for loc in locs]
y = [loc[1] for loc in locs]
plt.scatter(x,y)


Out[44]:
<matplotlib.collections.PathCollection at 0x7f714520a1d0>
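
To relate the grid resolution to the 300 m search radius used below, the spacing between adjacent grid points can be estimated (a rough calculation using the standard ~111 km per degree of latitude and a cos-latitude correction for longitude):

In [ ]:
# approximate grid spacing in meters
import math
dlat = (upperright[0] - lowerleft[0]) / 34.0  # 35 points span 34 intervals
dlng = (upperright[1] - lowerleft[1]) / 34.0
print 'N-S spacing: %.0f m' % (dlat * 111000)
print 'E-W spacing: %.0f m' % (abs(dlng) * 111000 * math.cos(math.radians(37.76)))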

In [ ]:
# collect location features for grid points

# same as futureRadar above, but takes a plain list of (lat, long) tuples
def futureRadar(locs, keyword, radius, key):
    url = 'https://maps.googleapis.com/maps/api/place/radarsearch/json'
    session = FuturesSession(max_workers=40)
    params_list = []
    for loc in locs:
        loc_str = ','.join( map(str, loc) )
        params = {'keyword': keyword, 'location': loc_str, 'radius': radius, 'key': key}
        params_list.append(params)
    futures = [session.get(url, params=params) for params in params_list]
    responses = [future.result() for future in futures]
    dicts = [response.json() for response in responses]
    return dicts

keywords = ['train', 'bus', 'food', 'shopping', 'park', 'school']
radius = 300

features = {}
for keyword in keywords:
    dicts = futureRadar(locs, keyword, radius, key)
    features[keyword] = [len(i['results']) for i in dicts]
    
elevations = []
for loc in locs:
    elev = get_elevation(loc)
    elevations.append(elev)
    time.sleep(10)  # throttle to stay under the API rate limit
    
features['elevation'] = elevations

In [ ]:
grid_df = pd.DataFrame(features)

lat = [loc[0] for loc in locs]
lng = [loc[1] for loc in locs]
grid_df['lat'] = lat
grid_df['lng'] = lng

with open('data/grid.csv', 'w') as f:
    grid_df.to_csv(f, index=False)

Collect public suggestion data


In [49]:
# download every suggestions page and save the raw JSON to disk
url = 'http://suggest.bayareabikeshare.com/api/places'
pages = range(1, 48)

for page in pages:
    payload = {'page': page, 'include_submissions': 'false'}
    response = requests.get(url, params=payload)
    st_json = response.json()
    with open('data/suggestion-json/page' + str(page), 'w') as f:
        json.dump(st_json, f)

In [ ]:
# parse the needed information out of a suggestions JSON page, returning namedtuples
def parseJson(st_json):
    stationBasic = namedtuple('stationBasic', ['stid', 'lat', 'lng', 'weight'])
    stationBasics = []
    for feature in st_json['features']:
        stid = feature['id']
        lat = feature['geometry']['coordinates'][1]
        lng = feature['geometry']['coordinates'][0]
        # suggestions with public "support" submissions carry a weight; others get 0
        if 'support' in feature['properties']['submission_sets']:
            weight = feature['properties']['submission_sets']['support']['length']
        else:
            weight = 0
        stationBasics.append(stationBasic(stid=stid, lat=lat, lng=lng, weight=weight))
    return stationBasics

# test = st_json
# testvalue = parseJson(test)
# test whether the parsed records carry valid support weights:
# df = pd.DataFrame(result)
# df[3].value_counts()
# testvalue

In [ ]:
# read suggestions in jsons and parse out the needed data
pages = range(1,48)

parsed = []
for page in pages: 
    filename = 'data/suggestion-json/page' + str(page)
    with open(filename, 'r') as f:
        st_json = json.load(f)
#         print len(parseJson(st_json))
        parsed.extend( parseJson(st_json) )
        
columns = ['stid', 'lat', 'lng', 'weight']
df = pd.DataFrame(parsed, columns=columns)
print 'total support weight:', df['weight'].sum()
print df['lng'].describe()

df.to_csv('data/suggestions.csv', index=False)